#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Validate that sacct -D shows correct job steps and states
#          when a job is requeued
############################################################################
# Copyright (C) 2014 SchedMD LLC
# Written by Nathan Yee <nyee32@schedmd.com>
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

# ID of the job under test; cleanup cancels whatever this holds
set job_id     0

# Evaluate the skip conditions before requesting a node, so that an
# unsupported configuration is reported as skipped rather than failing on
# (or spending time in) the node request below.
set accounting_storage_enforce [get_config_param "AccountingStorageEnforce"]
if {[param_contains $accounting_storage_enforce "nosteps"] || [param_contains $accounting_storage_enforce "nojobs"]} {
	skip "This test can not be run with nosteps or nojobs (AccountingStorageEnforce)"
}
if {![is_super_user]} {
	skip "Test can only be run as SlurmUser"
}

# Node the job is pinned to (-w) and whose state mod_state changes
set node       [get_nodes_by_request -fail "-t1 --exclusive"]

# Test teardown: cancel the job whose ID is stored in the global job_id
proc cleanup {} {
	cancel_job $::job_id
}

# Change the state of the test node with "scontrol update"
#
# Arguments:
#	state  - node state to request (this test uses "down" and "resume")
#	reason - text passed as the reason= option
#
# Fails the test if scontrol reports "Invalid node state specified", except
# when resuming with ReturnToService=2, where that error is tolerated.
proc mod_state { state reason } {
	global scontrol node

	set reply [run_command_output "$scontrol update nodename=$node state=$state reason=$reason"]

	# Nothing more to do if scontrol accepted the requested state
	if {![regexp "Invalid node state specified" $reply]} {
		return
	}

	# NOTE(review): presumably the node has already returned to service on
	# its own in this configuration, so the resume request is rejected
	if {$state eq "resume" && [get_config_param "ReturnToService"] == 2} {
		log_warn "This error is expected, no worries"
		return
	}

	fail "Problem changing node state"
}

# Verify how many batch step records sacct reports for the job
#
# Arguments:
#	num - expected number of occurrences of "batch" in the sacct listing
#	      (duplicates included via -D)
proc check_step { num } {
	global sacct job_id

	set listing [run_command_output -fail "$sacct --job=${job_id}.batch -D --start=now-15minutes --noheader --format=jobid -P"]

	# Each batch step record contributes one "batch" to the listing
	set steps [regexp -all "batch" $listing]
	subtest {$num == $steps} "Check number of steps" "$steps != $num"
}

# Count the sacct records (jobs and steps) of the test job that are in a
# specific state
#
# Arguments:
#	states - state name to look for; it is inserted into a regular
#	         expression and matched as the start of the State column
#
# Returns the number of "JobID|State" records that match.
proc check_sacct_states {states} {
	global job_id sacct

	# This test will requeue jobs making those jobs be eligible in the
	# future from sacct's perspective.  Since sacct only shows eligible
	# jobs we have to specify end in the future.
	set output [run_command_output -fail "$sacct --job=$job_id --duplicates --parsable2 --start=now-15minutes --end=tomorrow --noheader -o JobID,State"]

	# NOTE: Skip "extern" job container optionally spawned by "PrologFlags=contain"
	#
	# The pattern is brace-quoted so that "\." reaches the regexp engine
	# as an escaped (literal) dot. Inside double quotes Tcl would reduce
	# "\." to a bare ".", which matches any character instead of only the
	# separator between the job ID and the step name.
	set pattern {[0-9_]+(\.(?!extern)[a-z]+)*\|}
	append pattern $states

	return [regexp -all $pattern $output]
}

# Skip unless accounting records are stored through slurmdbd
set storage_type [get_config_param "AccountingStorageType"]
if {$storage_type ne "accounting_storage/slurmdbd"} {
	skip "Not using accounting_storage/slurmdbd"
}

# Test 1: start a requeue-able job on the chosen node
log_info "Test 1"
set job_id [submit_job -fail "-N1 -w$node --exclusive -o/dev/null --requeue --wrap='$bin_sleep 20'"]

wait_for_job -fail $job_id RUNNING

# Give the batch script time to start (message delays, prologs, etc.)
sleep 5

# Take the node running the job down
mod_state down $test_name

# Allow the node state change to take effect
sleep 5

# Bring the node back
mod_state resume $test_name

# Test 2: the job should be pending again after the node failure
log_info "Test 2"
wait_for_job -fail $job_id "PENDING"

# Wait for the state changes to propagate to the database for sacct
sleep 5

# Expected record counts per state:
#	NODE_FAIL 1 - the job state
#	CANCELLED 1 - the batch step state
#	PENDING   1 - the requeued job state
foreach {want_state want_count} {NODE_FAIL 1 CANCELLED 1 PENDING 1} {
	set got_count [check_sacct_states $want_state]
	subtest {$got_count == $want_count} "Test $want_state count" "$got_count != $want_count"
}

wait_for_job -fail $job_id "RUNNING"

# Give the batch script time to start (message delays, prologs, etc.)
sleep 5


log_info "Test 3"

# Expected record counts per state:
#	NODE_FAIL 1
#	CANCELLED 1
#	RUNNING   2 - the requeued job and its batch step
foreach {want_state want_count} {NODE_FAIL 1 CANCELLED 1 RUNNING 2} {
	set got_count [check_sacct_states $want_state]
	subtest {$got_count == $want_count} "Test $want_state count" "$got_count != $want_count"
}

# Explicitly requeue the job
run_command -fail "$scontrol requeue $job_id"

# Give the requeue some time, then confirm the job is pending again
sleep 8
wait_for_job -fail $job_id "PENDING"

# Let the state changes reach the database before querying sacct
sleep 5
log_info "Test 4"

# Expected record counts per state:
#	NODE_FAIL 1
#	REQUEUE   1
#	CANCELLED 2 - the first and second batch steps
#	PENDING   1
foreach {want_state want_count} {NODE_FAIL 1 REQUEUE 1 CANCELLED 2 PENDING 1} {
	set got_count [check_sacct_states $want_state]
	subtest {$got_count == $want_count} "Test $want_state count" "$got_count != $want_count"
}

wait_for_job -fail $job_id "RUNNING"

# Give the batch script time to start (message delays, prologs, etc.)
sleep 5

# After the requeue there should be 3 batch steps: the first 2, which are
# CANCELLED, plus the one that is running now.
check_step 3


log_info "Test 5"

# Expected record counts per state:
#	NODE_FAIL 1
#	REQUEUE   1
#	CANCELLED 2
#	RUNNING   2 - the job and its batch step
foreach {want_state want_count} {NODE_FAIL 1 REQUEUE 1 CANCELLED 2 RUNNING 2} {
	set got_count [check_sacct_states $want_state]
	subtest {$got_count == $want_count} "Test $want_state count" "$got_count != $want_count"
}

wait_for_job -fail $job_id "DONE"

# The number of batch steps must be unchanged once the job has completed
check_step 3
log_info "Test 6"

# Expected record counts per state:
#	NODE_FAIL 1
#	REQUEUE   1
#	CANCELLED 2
#	COMPLETED 2
foreach {want_state want_count} {NODE_FAIL 1 REQUEUE 1 CANCELLED 2 COMPLETED 2} {
	set got_count [check_sacct_states $want_state]
	subtest {$got_count == $want_count} "Test $want_state count" "$got_count != $want_count"
}
